library(class)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(e1071)
library(naniar)
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)

R Markdown

#Hello and welcome to Case Study 01, We are here to present findings from the Beers and Breweries datasets pulled from the company repository. We pulled these datasets into RStudio and conducted an extensive exploratory data analysis(EDA) for your review. We expect that Budweiser could benefit from our findings as you seek to get a greater understanding of the landscape of Breweries and Beer production in the US. For context, we looked at breweries and beers across the country by state and style with a specific focus on key beer characteristics including Alcohol By Volume and International Bitterness Unit.

#Read the Beers and Breweries data files in csv format

Beers = read.csv("/Users/Banu/Documents/SMU MSDS/MSDS_6306_DoingDataScience/Unit8/Beers.csv")
Breweries = read.csv("/Users/Banu/Documents/SMU MSDS/MSDS_6306_DoingDataScience/Unit8/Breweries.csv")

#Here are some of the questions that we asked as part of the EDA

#How many Breweries are present in each state #Group Breweries data by State #Plot a Bar Graph

Breweries %>% group_by(State) %>% dplyr::summarize(Breweries = n()) %>% print(n=51)
## # A tibble: 51 × 2
##    State Breweries
##    <chr>     <int>
##  1 " AK"         7
##  2 " AL"         3
##  3 " AR"         2
##  4 " AZ"        11
##  5 " CA"        39
##  6 " CO"        47
##  7 " CT"         8
##  8 " DC"         1
##  9 " DE"         2
## 10 " FL"        15
## 11 " GA"         7
## 12 " HI"         4
## 13 " IA"         5
## 14 " ID"         5
## 15 " IL"        18
## 16 " IN"        22
## 17 " KS"         3
## 18 " KY"         4
## 19 " LA"         5
## 20 " MA"        23
## 21 " MD"         7
## 22 " ME"         9
## 23 " MI"        32
## 24 " MN"        12
## 25 " MO"         9
## 26 " MS"         2
## 27 " MT"         9
## 28 " NC"        19
## 29 " ND"         1
## 30 " NE"         5
## 31 " NH"         3
## 32 " NJ"         3
## 33 " NM"         4
## 34 " NV"         2
## 35 " NY"        16
## 36 " OH"        15
## 37 " OK"         6
## 38 " OR"        29
## 39 " PA"        25
## 40 " RI"         5
## 41 " SC"         4
## 42 " SD"         1
## 43 " TN"         3
## 44 " TX"        28
## 45 " UT"         4
## 46 " VA"        16
## 47 " VT"        10
## 48 " WA"        23
## 49 " WI"        20
## 50 " WV"         1
## 51 " WY"         4
#Breweries %>% ggplot(aes(x = State, fill = State)) + geom_bar() + ggtitle("Distribution of Breweries by State") + ylab(" # of Breweries")

Breweries %>% ggplot(aes(x = State, fill = State)) + geom_bar() + ggtitle("Distribution of Breweries by State") + ylab(" # of Breweries") + geom_text(stat = "count", aes(label = after_stat(count)), vjust = 0) + theme(legend.position = "none") + xlab("State")

#visualization for the above question, as a heat map on the USA map

#install.packages("usmap")
library(usmap)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
StateBeerC = data.frame(state = c("AK","AL","AR","AZ","CA","CO","CT","DC","DE","FL","GA","HI","IA","ID","IL","IN","KS","KY","LA","MA","MD","ME","MI","MN","MO","MS","MT","NC","ND","NE","NH","NJ","NM","NV","NY","OH","OK","OR","PA","RI","SC","SD","TN","TX","UT","VA","VT","WA","WI","WV","WY"),values = c(7,3,2,11,39,47,8,1,2,15,7,4,5,5,18,22,3,4,5,23,7,9,32,12,9,2,9,19,1,5,3,3,4,2,16,15,6,29,25,5,4,1,3,28,4,16,10,23,20,1,4))

p <- plot_usmap(data = StateBeerC, values = "values", regions = "state") + scale_fill_continuous(low = "yellow", high = "red", name = "Number of Breweries", label = scales::comma) + labs(title = "Number of Breweries By State", ) + theme(legend.position = "right")
ggplotly(p)

#Merge beer data with the breweries data. Print the first 6 observations and the last six observations to check the #merged file.

Breweries$Brewery_id = Breweries$Brew_ID
Breweries <- Breweries %>% select(Brewery_id, Name, City, State)
BB <- merge(Beers,Breweries, by = "Brewery_id", all = TRUE)
dim(BB)
## [1] 2410   10
summary(BB)
##    Brewery_id       Name.x             Beer_ID            ABV         
##  Min.   :  1.0   Length:2410        Min.   :   1.0   Min.   :0.00100  
##  1st Qu.: 94.0   Class :character   1st Qu.: 808.2   1st Qu.:0.05000  
##  Median :206.0   Mode  :character   Median :1453.5   Median :0.05600  
##  Mean   :232.7                      Mean   :1431.1   Mean   :0.05977  
##  3rd Qu.:367.0                      3rd Qu.:2075.8   3rd Qu.:0.06700  
##  Max.   :558.0                      Max.   :2692.0   Max.   :0.12800  
##                                                      NA's   :62       
##       IBU            Style               Ounces         Name.y         
##  Min.   :  4.00   Length:2410        Min.   : 8.40   Length:2410       
##  1st Qu.: 21.00   Class :character   1st Qu.:12.00   Class :character  
##  Median : 35.00   Mode  :character   Median :12.00   Mode  :character  
##  Mean   : 42.71                      Mean   :13.59                     
##  3rd Qu.: 64.00                      3rd Qu.:16.00                     
##  Max.   :138.00                      Max.   :32.00                     
##  NA's   :1005                                                          
##      City              State          
##  Length:2410        Length:2410       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
## 
head(BB)
##   Brewery_id        Name.x Beer_ID   ABV IBU
## 1          1  Get Together    2692 0.045  50
## 2          1 Maggie's Leap    2691 0.049  26
## 3          1    Wall's End    2690 0.048  19
## 4          1       Pumpion    2689 0.060  38
## 5          1    Stronghold    2688 0.060  25
## 6          1   Parapet ESB    2687 0.056  47
##                                 Style Ounces             Name.y        City
## 1                        American IPA     16 NorthGate Brewing  Minneapolis
## 2                  Milk / Sweet Stout     16 NorthGate Brewing  Minneapolis
## 3                   English Brown Ale     16 NorthGate Brewing  Minneapolis
## 4                         Pumpkin Ale     16 NorthGate Brewing  Minneapolis
## 5                     American Porter     16 NorthGate Brewing  Minneapolis
## 6 Extra Special / Strong Bitter (ESB)     16 NorthGate Brewing  Minneapolis
##   State
## 1    MN
## 2    MN
## 3    MN
## 4    MN
## 5    MN
## 6    MN
tail(BB)
##      Brewery_id                    Name.x Beer_ID   ABV IBU
## 2405        556             Pilsner Ukiah      98 0.055  NA
## 2406        557  Heinnieweisse Weissebier      52 0.049  NA
## 2407        557           Snapperhead IPA      51 0.068  NA
## 2408        557         Moo Thunder Stout      50 0.049  NA
## 2409        557         Porkslap Pale Ale      49 0.043  NA
## 2410        558 Urban Wilderness Pale Ale      30 0.049  NA
##                        Style Ounces                        Name.y          City
## 2405         German Pilsener     12         Ukiah Brewing Company         Ukiah
## 2406              Hefeweizen     12       Butternuts Beer and Ale Garrattsville
## 2407            American IPA     12       Butternuts Beer and Ale Garrattsville
## 2408      Milk / Sweet Stout     12       Butternuts Beer and Ale Garrattsville
## 2409 American Pale Ale (APA)     12       Butternuts Beer and Ale Garrattsville
## 2410        English Pale Ale     12 Sleeping Lady Brewing Company     Anchorage
##      State
## 2405    CA
## 2406    NY
## 2407    NY
## 2408    NY
## 2409    NY
## 2410    AK

#Address the missing values in each column #https://www.masterclass.com/articles/ibu-beer #We read there are styles of beer and each style has a range of IBU, we could use the sytle of the beer to assign a #value #After doing the missing value analysis to figure out if its a MCAR/MAR/MNAR, we felt the values are missing completely at Random, they are not missing based on another variable or missing because of themselves. We did some reading online for ABV and IBU values and learnt that these values are a range based on the style of the Beer, so we used mean ABV and IBU by Style, to fill in the missing data and merged all the data points

gg_miss_var(BB) + ggtitle("Missing Values in Dataset")

MeanIBU <- BB %>% filter(!is.na(IBU)) %>% group_by(Style) %>% dplyr::summarize(IBUMean = mean(IBU))
BB <- merge(BB,MeanIBU, by="Style")
BB$IBU[is.na(BB$IBU)] <- BB$IBUMean[is.na(BB$IBU)]

MeanABV <- BB %>% filter(!is.na(ABV)) %>% group_by(Style) %>% dplyr::summarize(ABVMean=mean(ABV))
BB <- merge(BB,MeanABV, by="Style")
BB$ABV[is.na(BB$ABV)] <- BB$ABVMean[is.na(BB$ABV)]

#Compute the median alcohol content and international bitterness unit for each state. Plot a bar chart to compare.

BB %>%  ggplot(aes(x = State, y = IBU, fill = State)) + geom_bar(position = "dodge", stat = "summary", fun="median") + ggtitle("Distribution of Median IBU of Breweries by State") + ylab("International Bitterness Unit") + theme(legend.position = "none")

BB %>%  ggplot(aes(x = State, y = ABV, fill = State)) + geom_bar(position = "dodge", stat = "summary", fun="median") + ggtitle("Distribution of Median ABV of Breweries by State") + ylab("Alcohol By Volume") + theme(legend.position = "none")

#Which state has the maximum alcoholic (ABV) beer? Which state has the most bitter (IBU) beer? #Colorado has the maximum alcoholic beer and Oregon has the most bitter beeR #Finding States with Maximum IBU and ABV values

BB[which.max(BB$ABV),]
##                 Style Brewery_id
## 2130 Quadrupel (Quad)         52
##                                                    Name.x Beer_ID   ABV IBU
## 2130 Lee Hill Series Vol. 5 - Belgian Style Quadrupel Ale    2565 0.128  24
##      Ounces                  Name.y    City State IBUMean ABVMean
## 2130   19.2 Upslope Brewing Company Boulder    CO      24   0.104
BB[which.max(BB$IBU),]
##                              Style Brewery_id                    Name.x Beer_ID
## 429 American Double / Imperial IPA        375 Bitter Bitch Imperial IPA     980
##       ABV IBU Ounces                  Name.y    City State IBUMean    ABVMean
## 429 0.082 138     12 Astoria Brewing Company Astoria    OR   93.32 0.08736893

#Comment on the summary statistics and distribution of the ABV variable. #Outliers: There are few outliers in the ABV data. ​ #Skewness: ABV values have a right-skew distribution​ #Interquartile Range: 50th percentile of ABV values fall between 0.050 (Q1) and 0.067 (Q3). ​ # (IQR = Q3 – Q1 = 0.017)​ #Range: The measure of spread is large. ​

summary(BB$ABV)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.02700 0.05000 0.05600 0.05972 0.06700 0.12800
sd(BB$ABV)
## [1] 0.0134303

#Is there an apparent relationship between the bitterness of the beer and its alcoholic content? Draw a scatter plot. Make your best judgment of a relationship and EXPLAIN your answer. #I do not think that there is enough evidence to say that there is a linear relationship between ABV and IBU values. Looking at the 50th percentile of data, there is a linear relationship, but outside of that range there is no clear linear relationship and I am wondering if this is because of certain syles of beers

BB %>% ggplot(aes(x=ABV, y=IBU)) + geom_point(color = "red") +geom_smooth(position="jitter") + geom_smooth() + geom_smooth(method = lm) + ggtitle("Distribution of IBU vs ABV")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using formula = 'y ~ x'

#Trying another way of smoothing the curve, it does appear that there is a linear relationship between ABV #and IBU values

BB %>% ggplot(aes(x=ABV, y=IBU)) + geom_point() +geom_smooth(method=lm) + ggtitle("Distribution of IBU vs ABV")
## `geom_smooth()` using formula = 'y ~ x'

#Investigate the difference with respect to IBU and ABV between IPAs (India Pale Ales) and other types of #Ale (any beer with “Ale” in its name other than IPA) #Create a dataset for the two syles of Beers (IPA and Ale) #Plot the relationship between ABV and IBU for IPA and Ales. From the plot we can tell that IPAs have highter values of ABU and IBU.

BBIPAAle <- BB %>% filter(str_detect(Style, "IPA")|str_detect(Style, " Ale"))
BBIPAAle$AleType <- ifelse(str_detect(BBIPAAle$Style,"IPA"),"IPA","Ale")

BBIPAAle %>% ggplot(aes(x=ABV, y = IBU, color = AleType)) + geom_point() +ggtitle("Distribution of IBU vs ABV by Style (IPA vs Ale)")

#We would like to investigate the relationship further using the KNN Classification model. Below we re using the internal classification to find the accuracy of our predictions and we see that we have 91% accuracy using KNN internal classification model with a K value of 5

classifications = knn.cv(BBIPAAle[,c(5,6)],BBIPAAle$AleType, prob = TRUE, k = 5)
  table(classifications,BBIPAAle$AleType)
##                
## classifications Ale IPA
##             Ale 892  67
##             IPA  70 504
  CM = confusionMatrix(table(classifications,BBIPAAle$AleType))
  CM
## Confusion Matrix and Statistics
## 
##                
## classifications Ale IPA
##             Ale 892  67
##             IPA  70 504
##                                           
##                Accuracy : 0.9106          
##                  95% CI : (0.8952, 0.9244)
##     No Information Rate : 0.6275          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.809           
##                                           
##  Mcnemar's Test P-Value : 0.8643          
##                                           
##             Sensitivity : 0.9272          
##             Specificity : 0.8827          
##          Pos Pred Value : 0.9301          
##          Neg Pred Value : 0.8780          
##              Prevalence : 0.6275          
##          Detection Rate : 0.5819          
##    Detection Prevalence : 0.6256          
##       Balanced Accuracy : 0.9049          
##                                           
##        'Positive' Class : Ale             
## 

#We would also like to investigate using the 70/30 Train/Test split. Here too, the accuracy is 89%

 trainI = sample(seq(1:length(BBIPAAle$IBU)),round(.7*length(BBIPAAle$IBU)))
  train = BBIPAAle[trainI,]
  test = BBIPAAle[-trainI,]
  classifications = knn(train[,c(5,6)],test[,c(5,6)],train$AleType, prob = TRUE, k = 5)
  table(classifications,test$AleType)
##                
## classifications Ale IPA
##             Ale 265  29
##             IPA  17 149
  CM = confusionMatrix(table(classifications,test$AleType))
  CM
## Confusion Matrix and Statistics
## 
##                
## classifications Ale IPA
##             Ale 265  29
##             IPA  17 149
##                                           
##                Accuracy : 0.9             
##                  95% CI : (0.8689, 0.9259)
##     No Information Rate : 0.613           
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7866          
##                                           
##  Mcnemar's Test P-Value : 0.1048          
##                                           
##             Sensitivity : 0.9397          
##             Specificity : 0.8371          
##          Pos Pred Value : 0.9014          
##          Neg Pred Value : 0.8976          
##              Prevalence : 0.6130          
##          Detection Rate : 0.5761          
##    Detection Prevalence : 0.6391          
##       Balanced Accuracy : 0.8884          
##                                           
##        'Positive' Class : Ale             
## 

#Investigating using another classification model Naive Bayes with a 70/30 Train/Test split to compare with KNN classification and here we get an accuracy of 85%

splitPerc = .7 #Training / Test split Percentage
trainI = sample(seq(1:length(BBIPAAle$IBU)),round(.7*length(BBIPAAle$IBU)))
train = BBIPAAle[trainI,]
test = BBIPAAle[-trainI,]
model = naiveBayes(train[,c(5,6)],train$AleType)
table(predict(model,test[,c(5,6)]),test$AleType)
##      
##       Ale IPA
##   Ale 261  25
##   IPA  24 150
CM = confusionMatrix(table(predict(model,test[,c(5,6)]),test$AleType))
CM
## Confusion Matrix and Statistics
## 
##      
##       Ale IPA
##   Ale 261  25
##   IPA  24 150
##                                           
##                Accuracy : 0.8935          
##                  95% CI : (0.8616, 0.9201)
##     No Information Rate : 0.6196          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.7738          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9158          
##             Specificity : 0.8571          
##          Pos Pred Value : 0.9126          
##          Neg Pred Value : 0.8621          
##              Prevalence : 0.6196          
##          Detection Rate : 0.5674          
##    Detection Prevalence : 0.6217          
##       Balanced Accuracy : 0.8865          
##                                           
##        'Positive' Class : Ale             
## 

#Tuning NB for 100 iterations

# NB Loop for average of many training / test partition

iterations = 100

masterAcc = matrix(nrow = iterations, ncol = 3)

splitPerc = .7 #Training / Test split Percentage

for(j in 1:iterations)
{
  
  trainI = sample(seq(1:length(BBIPAAle$IBU)),round(.7*length(BBIPAAle$IBU)))
  train = BBIPAAle[trainI,]
  test = BBIPAAle[-trainI,]
  
  model = naiveBayes(train[,c(5,6)],train$AleType)
  table(predict(model,test[,c(5,6)]),test$AleType)
  CM = confusionMatrix(table(predict(model,test[,c(5,6)]),test$AleType))
  masterAcc[j,1] = CM$overall[1]
  masterAcc[j,2] = CM$byClass[1]
  masterAcc[j,3] = CM$byClass[2]
}
MeanAcc = colMeans(masterAcc)
MeanAcc
## [1] 0.8758696 0.8991731 0.8369258
accs = data.frame(accuracy = numeric(30), k = numeric(30))
 trainI = sample(seq(1:length(BBIPAAle$IBU)),round(.7*length(BBIPAAle$IBU)))
  train = BBIPAAle[trainI,]
  test = BBIPAAle[-trainI,]
  
for(i in 1:30)
{
  classifications = knn(train[,c(5,6)],test[,c(5,6)],train$AleType, prob = TRUE, k = i)
  table(test$AleType,classifications)
  CM = confusionMatrix(table(test$AleType,classifications))
  accs$accuracy[i] = CM$overall[1]
  accs$k[i] = i
}

plot(accs$k,accs$accuracy, type = "l", xlab = "k")

which.max(accs$accuracy)
## [1] 1

#Tuning KNN for 100 iterations and 30 values of K

iterations = 100
numks = 30

masterAcc = matrix(nrow = iterations, ncol = numks)
  
for(j in 1:iterations)
{
accs = data.frame(accuracy = numeric(30), k = numeric(30))
 trainI = sample(seq(1:length(BBIPAAle$IBU)),round(.7*length(BBIPAAle$IBU)))
  train = BBIPAAle[trainI,]
  test = BBIPAAle[-trainI,]
for(i in 1:numks)
{
  classifications = knn(train[,c(5,6)],test[,c(5,6)],train$AleType, prob = TRUE, k = i)
  table(classifications,test$AleType)
  CM = confusionMatrix(table(classifications,test$AleType))
  masterAcc[j,i] = CM$overall[1]
}

}

MeanAcc = colMeans(masterAcc)
which.max(MeanAcc)
## [1] 5
mean(MeanAcc)
## [1] 0.8843667
plot(seq(1,numks,1),MeanAcc, type = "l",xlab = "K")

# With all the methods we can clearly see that Ale and IPA have a clear distingsion with respect to ABB and IBU values #Classifying Beers into 7 broad Categories Ale, IPA, Stout, Pilsner, Beer, Lager and Other #Since Budweiser beers are of Lager, comparing Lagers against IPA and Ales #Lager is a smaller section of Beers when compared to Ale and IPAs. Budweiser could look into capturing the market of IPAs an Ales

BBClassify <- BB
BBClassify$BeerType = ifelse(str_detect(BBClassify$Style, "IPA"),"IPA",ifelse(str_detect(BBClassify$Style, "Stout"), "Stout",ifelse(str_detect(BBClassify$Style, "Pilsner"),"Pilsner",ifelse(str_detect(BBClassify$Style, "Beer"),"Beer",ifelse(str_detect(BBClassify$Style, " Ale"),"Ale",ifelse(str_detect(BBClassify$Style, "Lager"),"Lager","Other"))))))

BBClassify %>% ggplot(aes(x=BeerType, fill= BeerType)) + geom_bar() + ggtitle("Distribution of Beers by Beer Type") + ylab("Beers") + xlab("Beer Type")

BBClassify %>% filter(BeerType == "IPA" |BeerType == "Ale" |BeerType == "Lager") %>% ggplot(aes(x=ABV, y = IBU, color = BeerType)) + geom_jitter() + ggtitle("Distribution of IBU vs ABV by Style (IPA vs Ale vs Lager)")